{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "import pandas as pd\n",
    "import math\n",
    "iris = datasets.load_iris()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3  0\n",
       "0  5.1  3.5  1.4  0.2  0\n",
       "1  4.9  3.0  1.4  0.2  0\n",
       "2  4.7  3.2  1.3  0.2  0\n",
       "3  4.6  3.1  1.5  0.2  0\n",
       "4  5.0  3.6  1.4  0.2  0"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1=pd.DataFrame(iris.data)\n",
    "df2=pd.DataFrame(iris.target)\n",
    "df=pd.concat([df1,df2],axis=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = [\"sl\", \"sw\", 'pl', 'pw','Label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sl</th>\n",
       "      <th>sw</th>\n",
       "      <th>pl</th>\n",
       "      <th>pw</th>\n",
       "      <th>Label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    sl   sw   pl   pw  Label\n",
       "0  5.1  3.5  1.4  0.2      0\n",
       "1  4.9  3.0  1.4  0.2      0\n",
       "2  4.7  3.2  1.3  0.2      0\n",
       "3  4.6  3.1  1.5  0.2      0\n",
       "4  5.0  3.6  1.4  0.2      0"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label(val, *boundaries):\n",
    "    if (val < boundaries[0]):\n",
    "        return 'a'\n",
    "    elif (val < boundaries[1]):\n",
    "        return 'b'\n",
    "    elif (val < boundaries[2]):\n",
    "        return 'c'\n",
    "    else:\n",
    "        return 'd'\n",
    "\n",
    "def toLabel(df, old_feature_name):\n",
    "    second = df[old_feature_name].mean()\n",
    "    minimum = df[old_feature_name].min()\n",
    "    first = (minimum + second)/2\n",
    "    maximum = df[old_feature_name].max()\n",
    "    third = (maximum + second)/2\n",
    "    return df[old_feature_name].apply(label, args= (first, second, third))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['sl_labeled'] = toLabel(df, 'sl')\n",
    "df['sw_labeled'] = toLabel(df, 'sw')\n",
    "df['pl_labeled'] = toLabel(df, 'pl')\n",
    "df['pw_labeled'] = toLabel(df, 'pw')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sl</th>\n",
       "      <th>sw</th>\n",
       "      <th>pl</th>\n",
       "      <th>pw</th>\n",
       "      <th>Label</th>\n",
       "      <th>sl_labeled</th>\n",
       "      <th>sw_labeled</th>\n",
       "      <th>pl_labeled</th>\n",
       "      <th>pw_labeled</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "      <td>b</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    sl   sw   pl   pw  Label sl_labeled sw_labeled pl_labeled pw_labeled\n",
       "0  5.1  3.5  1.4  0.2      0          b          c          a          a\n",
       "1  4.9  3.0  1.4  0.2      0          a          b          a          a\n",
       "2  4.7  3.2  1.3  0.2      0          a          c          a          a\n",
       "3  4.6  3.1  1.5  0.2      0          a          c          a          a\n",
       "4  5.0  3.6  1.4  0.2      0          a          c          a          a"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Label</th>\n",
       "      <th>sl_labeled</th>\n",
       "      <th>sw_labeled</th>\n",
       "      <th>pl_labeled</th>\n",
       "      <th>pw_labeled</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>b</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Label sl_labeled sw_labeled pl_labeled pw_labeled\n",
       "0      0          b          c          a          a\n",
       "1      0          a          b          a          a\n",
       "2      0          a          c          a          a\n",
       "3      0          a          c          a          a\n",
       "4      0          a          c          a          a"
      ]
     },
     "execution_count": 230,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150"
      ]
     },
     "execution_count": 253,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Label'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['sl_labeled', 'sw_labeled', 'pl_labeled', 'pw_labeled']\n"
     ]
    }
   ],
   "source": [
    "unused_features = df.columns[1:5]\n",
    "unused_features = list(unused_features)\n",
    "print(unused_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "def entropy(s):\n",
    "    e=0\n",
    "    for val in s.value_counts():\n",
    "        p=val/s.count()\n",
    "        e+= -1*p*(math.log(p,2))\n",
    "    return e\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "def buildtree(df,unused_features,level):\n",
    "    l=[]        \n",
    "    if(unused_features!=[] and entropy(df['Label'])!=0):\n",
    "\n",
    "        for f in unused_features:\n",
    "            possible_vals=set(df[f])\n",
    "            comb_entropy=0\n",
    "            for val in possible_vals:\n",
    "                s=df[df[f]==val]['Label']\n",
    "                comb_entropy+=entropy(s)*(s.count()/df['Label'].count())\n",
    "            gain=entropy(df['Label'])-comb_entropy\n",
    "            l.append(gain)\n",
    "    \n",
    "        maxgain=max(l)\n",
    "        best_feature = unused_features[l.index(maxgain)]\n",
    "        print(\"Level\",level)\n",
    "        print(\"Current Entropy is\",entropy(df['Label']))\n",
    "        values=df['Label'].value_counts().keys().tolist()\n",
    "        for val in values:\n",
    "            print(\"Count of \",val,\"is\",df['Label'].value_counts(ascending=True)[val])\n",
    "        print(\"Splitting on feature\",best_feature,\"with gain\",maxgain)\n",
    "        print()\n",
    "        newunused_features=unused_features.copy()\n",
    "        newunused_features.remove(best_feature)\n",
    "    \n",
    "    \n",
    "        possible_vals=set(df[best_feature])\n",
    "        for val in possible_vals:\n",
    "            buildtree(df[df[best_feature]==val],newunused_features,level+1)\n",
    "    \n",
    "    else:\n",
    "        print(\"Level\",level)\n",
    "        print(\"Current Entropy is\",entropy(df['Label']))\n",
    "        values=df['Label'].value_counts().keys().tolist()\n",
    "        for val in values:\n",
    "            print(\"Count of\" ,val,\"is\",df['Label'].value_counts(ascending=True)[val])\n",
    "        print(\"Reached leaf node\")\n",
    "        print()\n",
    "        return\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.584962500721156"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entropy(df['Label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Level 0\n",
      "Current Entropy is 1.584962500721156\n",
      "Count of  2 is 50\n",
      "Count of  1 is 50\n",
      "Count of  0 is 50\n",
      "Splitting on feature pw_labeled with gain 1.2627308217896138\n",
      "\n",
      "Level 1\n",
      "Current Entropy is 0.0\n",
      "Count of 0 is 50\n",
      "Reached leaf node\n",
      "\n",
      "Level 1\n",
      "Current Entropy is 0.863120568566631\n",
      "Count of  1 is 40\n",
      "Count of  2 is 16\n",
      "Splitting on feature pl_labeled with gain 0.3107100419019655\n",
      "\n",
      "Level 2\n",
      "Current Entropy is 0.0\n",
      "Count of 1 is 1\n",
      "Reached leaf node\n",
      "\n",
      "Level 2\n",
      "Current Entropy is 0.6581912658132185\n",
      "Count of  1 is 39\n",
      "Count of  2 is 8\n",
      "Splitting on feature sl_labeled with gain 0.15790810784426046\n",
      "\n",
      "Level 3\n",
      "Current Entropy is 0.0\n",
      "Count of 1 is 14\n",
      "Reached leaf node\n",
      "\n",
      "Level 3\n",
      "Current Entropy is 0.0\n",
      "Count of 1 is 2\n",
      "Reached leaf node\n",
      "\n",
      "Level 3\n",
      "Current Entropy is 0.783776947484701\n",
      "Count of  1 is 23\n",
      "Count of  2 is 7\n",
      "Splitting on feature sw_labeled with gain 0.08807926473635475\n",
      "\n",
      "Level 4\n",
      "Current Entropy is 0.8812908992306927\n",
      "Count of 1 is 14\n",
      "Count of 2 is 6\n",
      "Reached leaf node\n",
      "\n",
      "Level 4\n",
      "Current Entropy is 0.0\n",
      "Count of 1 is 6\n",
      "Reached leaf node\n",
      "\n",
      "Level 4\n",
      "Current Entropy is 0.8112781244591328\n",
      "Count of 1 is 3\n",
      "Count of 2 is 1\n",
      "Reached leaf node\n",
      "\n",
      "Level 3\n",
      "Current Entropy is 0.0\n",
      "Count of 2 is 1\n",
      "Reached leaf node\n",
      "\n",
      "Level 2\n",
      "Current Entropy is 0.0\n",
      "Count of 2 is 8\n",
      "Reached leaf node\n",
      "\n",
      "Level 1\n",
      "Current Entropy is 0.0\n",
      "Count of 2 is 34\n",
      "Reached leaf node\n",
      "\n",
      "Level 1\n",
      "Current Entropy is 0.0\n",
      "Count of 1 is 10\n",
      "Reached leaf node\n",
      "\n"
     ]
    }
   ],
   "source": [
    "buildtree(df,unused_features,0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
